#Pollutants EXploratory Data Analysis
rm(list = ls())
library(ggplot2)
library(dplyr)
library(tidyr)
library(imputeTS)
library(forecast)
#Read Files
CO <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/CO.csv')
NO <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/NO.csv')
NO2 <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/NO2.csv')
NOX <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/NOX.csv')
O3 <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/O3.csv')
PM25 <- read.csv('/Users//SallyA/Documents/Capstone CKME/Data Files/PM25.csv')
#It appears that there are 9999.00 and -999.00 and NAs in the data for missing values
# Omitting all NAs that are in rows at the end of the data frames
CO <- na.omit(CO)
NO <- na.omit(NO)
NO2 <- na.omit(NO2)
NOX <- na.omit(NOX)
O3 <- na.omit(O3)
PM25 <- na.omit(PM25)
# Replace all 9999.00s and -999.00s with NA
CO[CO==9999] <- NA
CO[CO==-999] <- NA
NO[NO==9999] <- NA
NO[NO==-999] <- NA
NO2[NO2==9999] <- NA
NO2[NO2==-999] <- NA
NOX[NOX==9999] <- NA
NOX[NOX==-999] <- NA
O3[O3==9999] <- NA
O3[O3==-999] <- NA
PM25[PM25==9999] <- NA
PM25[PM25==-999] <- NA
#Remove the first column from all dataframes as it is no necessary
CO$X = NULL
NO$X = NULL
NO2$X = NULL
NOX$X = NULL
O3$X = NULL
PM25$X = NULL
#Converting Hour Columns to Rows of Data
New_CO <- CO %>%
pivot_longer(-Date, names_to = "Hour", values_to = "CO")
New_NO <- NO %>%
pivot_longer(-Date, names_to = "Hour", values_to = "NO")
New_NO2 <- NO2 %>%
pivot_longer(-Date, names_to = "Hour", values_to = "NO2")
New_NOX <- NOX %>%
pivot_longer(-Date, names_to = "Hour", values_to = "NOX")
New_O3 <- O3 %>%
pivot_longer(-Date, names_to = "Hour", values_to = "O3")
New_PM25 <- PM25 %>%
pivot_longer(-Date, names_to = "Hour", values_to = "PM25")
#After using str(DF) it become evident that the Date column is changed to factor, in this line, converting the
## Date back to Date
New_CO$Date <- as.Date(New_CO$Date)
New_NO$Date <- as.Date(New_NO$Date)
New_NO2$Date <- as.Date(New_NO2$Date)
New_NOX$Date <- as.Date(New_NOX$Date)
New_O3$Date <- as.Date(New_O3$Date)
New_PM25$Date <- as.Date(New_PM25$Date)
#Reviewing the dataframes
summary(New_CO)
## Date Hour CO
## Min. :2006-01-01 Length:43824 Min. :0.0000
## 1st Qu.:2007-04-02 Class :character 1st Qu.:0.1100
## Median :2008-07-01 Mode :character Median :0.2000
## Mean :2008-07-01 Mean :0.2096
## 3rd Qu.:2009-10-01 3rd Qu.:0.2800
## Max. :2010-12-31 Max. :1.7000
## NA's :1897
summary(New_NO)
## Date Hour NO
## Min. :2006-01-01 Length:43824 Min. : 0.000
## 1st Qu.:2007-04-02 Class :character 1st Qu.: 1.000
## Median :2008-07-01 Mode :character Median : 2.000
## Mean :2008-07-01 Mean : 5.421
## 3rd Qu.:2009-10-01 3rd Qu.: 5.000
## Max. :2010-12-31 Max. :260.000
## NA's :206
summary(New_NO2)
## Date Hour NO2
## Min. :2006-01-01 Length:43824 Min. : 0.00
## 1st Qu.:2007-04-02 Class :character 1st Qu.:10.00
## Median :2008-07-01 Mode :character Median :15.00
## Mean :2008-07-01 Mean :17.39
## 3rd Qu.:2009-10-01 3rd Qu.:23.00
## Max. :2010-12-31 Max. :75.00
## NA's :200
summary(New_NOX)
## Date Hour NOX
## Min. :2006-01-01 Length:43824 Min. : 0.00
## 1st Qu.:2007-04-02 Class :character 1st Qu.: 12.00
## Median :2008-07-01 Mode :character Median : 18.00
## Mean :2008-07-01 Mean : 22.84
## 3rd Qu.:2009-10-01 3rd Qu.: 28.00
## Max. :2010-12-31 Max. :334.00
## NA's :200
summary(New_O3)
## Date Hour O3
## Min. :2006-01-01 Length:43824 Min. : 0.00
## 1st Qu.:2007-04-02 Class :character 1st Qu.: 14.00
## Median :2008-07-01 Mode :character Median : 24.00
## Mean :2008-07-01 Mean : 24.99
## 3rd Qu.:2009-10-01 3rd Qu.: 34.00
## Max. :2010-12-31 Max. :102.00
## NA's :281
summary(New_PM25)
## Date Hour PM25
## Min. :2006-01-01 Length:43824 Min. : 0.000
## 1st Qu.:2007-04-02 Class :character 1st Qu.: 2.000
## Median :2008-07-01 Mode :character Median : 5.000
## Mean :2008-07-01 Mean : 6.548
## 3rd Qu.:2009-10-01 3rd Qu.: 9.000
## Max. :2010-12-31 Max. :52.000
## NA's :308
#Histrograms
hist(x=New_CO$CO, col = "blue", breaks= 50, xlab = "CO Levels", main = "Hourly Numbers")

hist(x=New_NO$NO, col = "blue", breaks= 50, xlab = "NO Levels", main = "Hourly Numbers")

hist(x=New_NO2$NO2, col = "blue", breaks= 20, xlab = "NO2 Levels", main = "Hourly Numbers")

hist(x=New_NOX$NOX, col = "blue", breaks= 50, xlab = "NOX Levels", main = "Hourly Numbers")

hist(x=New_O3$O3, col = "blue", breaks= 10, xlab = "O3 Levels", main = "Hourly Numbers")

hist(x=New_PM25$PM25, col = "blue", breaks= 10, xlab = "PM2.5 Levels", main = "Hourly Numbers")

#Plotting the Pollutants
g_CO <- ggplot(New_CO, aes(x=Date, y=CO))
p_CO <- g_CO + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_CO)

g_NO <- ggplot(New_NO, aes(x=Date, y=NO))
p_NO <- g_NO + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_NO)

g_NO2 <- ggplot(New_NO2, aes(x=Date, y=NO2))
p_NO2 <- g_NO2 + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_NO2)

g_NOX <- ggplot(New_NOX, aes(x=Date, y=NOX))
p_NOX <- g_NOX + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_NOX)

g_O3 <- ggplot(New_O3, aes(x=Date, y=O3))
p_O3 <- g_O3 + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_O3)

g_PM25 <- ggplot(New_PM25, aes(x=Date, y=PM25))
p_PM25 <- g_PM25 + geom_line() + scale_x_date(date_labels = "%Y", date_breaks = "1 year") + theme_minimal()
print(p_PM25)

#Statistics on NAs
t_CO <- New_CO$CO
t_NO <- New_NO$NO
t_NO2 <- New_NO2$NO2
t_NOX <- New_NOX$NOX
t_O3 <- New_O3$O3
t_PM25 <- New_PM25$PM25
statsNA(t_CO)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 1897
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "4.33%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 244 NAs (2.23%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 1272 NAs (11.6%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 173 NAs (1.58%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 208 NAs (1.9%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "948 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 132 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "948 NA in a row (occuring 1 times, making up for overall 948 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 132 times"
## [1] " 2 NA in a row: 23 times"
## [1] " 3 NA in a row: 9 times"
## [1] " 4 NA in a row: 4 times"
## [1] " 5 NA in a row: 1 times"
## [1] " 6 NA in a row: 1 times"
## [1] " 7 NA in a row: 2 times"
## [1] " 8 NA in a row: 1 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 13 NA in a row: 1 times"
## [1] " 16 NA in a row: 1 times"
## [1] " 20 NA in a row: 1 times"
## [1] " 61 NA in a row: 1 times"
## [1] " 110 NA in a row: 1 times"
## [1] " 184 NA in a row: 1 times"
## [1] " 280 NA in a row: 1 times"
## [1] " 948 NA in a row: 1 times"
statsNA(t_NO)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 206
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.47%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 62 NAs (0.566%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 73 NAs (0.666%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 25 NAs (0.228%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 46 NAs (0.42%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "32 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 35 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "2 NA in a row (occuring 19 times, making up for overall 38 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 35 times"
## [1] " 2 NA in a row: 19 times"
## [1] " 3 NA in a row: 5 times"
## [1] " 4 NA in a row: 2 times"
## [1] " 6 NA in a row: 2 times"
## [1] " 7 NA in a row: 1 times"
## [1] " 9 NA in a row: 2 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 30 NA in a row: 1 times"
## [1] " 32 NA in a row: 1 times"
statsNA(t_NO2)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 200
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.456%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 62 NAs (0.566%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 73 NAs (0.666%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 25 NAs (0.228%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 40 NAs (0.365%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "32 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 34 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "2 NA in a row (occuring 18 times, making up for overall 36 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 34 times"
## [1] " 2 NA in a row: 18 times"
## [1] " 3 NA in a row: 4 times"
## [1] " 4 NA in a row: 2 times"
## [1] " 6 NA in a row: 2 times"
## [1] " 7 NA in a row: 1 times"
## [1] " 9 NA in a row: 2 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 30 NA in a row: 1 times"
## [1] " 32 NA in a row: 1 times"
statsNA(t_NOX)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 200
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.456%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 62 NAs (0.566%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 73 NAs (0.666%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 25 NAs (0.228%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 40 NAs (0.365%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "32 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 34 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "2 NA in a row (occuring 18 times, making up for overall 36 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 34 times"
## [1] " 2 NA in a row: 18 times"
## [1] " 3 NA in a row: 4 times"
## [1] " 4 NA in a row: 2 times"
## [1] " 6 NA in a row: 2 times"
## [1] " 7 NA in a row: 1 times"
## [1] " 9 NA in a row: 2 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 30 NA in a row: 1 times"
## [1] " 32 NA in a row: 1 times"
statsNA(t_O3)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 281
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.641%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 47 NAs (0.429%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 37 NAs (0.338%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 150 NAs (1.37%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 47 NAs (0.429%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "122 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 42 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "122 NA in a row (occuring 1 times, making up for overall 122 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 42 times"
## [1] " 2 NA in a row: 23 times"
## [1] " 3 NA in a row: 1 times"
## [1] " 4 NA in a row: 2 times"
## [1] " 5 NA in a row: 1 times"
## [1] " 6 NA in a row: 1 times"
## [1] " 7 NA in a row: 2 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 24 NA in a row: 1 times"
## [1] " 122 NA in a row: 1 times"
statsNA(t_PM25)
## [1] "Length of time series:"
## [1] 43824
## [1] "-------------------------"
## [1] "Number of Missing Values:"
## [1] 308
## [1] "-------------------------"
## [1] "Percentage of Missing Values:"
## [1] "0.703%"
## [1] "-------------------------"
## [1] "Stats for Bins"
## [1] " Bin 1 (10956 values from 1 to 10956) : 68 NAs (0.621%)"
## [1] " Bin 2 (10956 values from 10957 to 21912) : 56 NAs (0.511%)"
## [1] " Bin 3 (10956 values from 21913 to 32868) : 59 NAs (0.539%)"
## [1] " Bin 4 (10956 values from 32869 to 43824) : 125 NAs (1.14%)"
## [1] "-------------------------"
## [1] "Longest NA gap (series of consecutive NAs)"
## [1] "14 in a row"
## [1] "-------------------------"
## [1] "Most frequent gap size (series of consecutive NA series)"
## [1] "1 NA in a row (occuring 114 times)"
## [1] "-------------------------"
## [1] "Gap size accounting for most NAs"
## [1] "1 NA in a row (occuring 114 times, making up for overall 114 NAs)"
## [1] "-------------------------"
## [1] "Overview NA series"
## [1] " 1 NA in a row: 114 times"
## [1] " 2 NA in a row: 39 times"
## [1] " 3 NA in a row: 7 times"
## [1] " 4 NA in a row: 3 times"
## [1] " 5 NA in a row: 4 times"
## [1] " 6 NA in a row: 4 times"
## [1] " 7 NA in a row: 2 times"
## [1] " 11 NA in a row: 1 times"
## [1] " 14 NA in a row: 1 times"
#Convert to TS and plotting in hourly basis
TS_CO <- ts(t_CO, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_CO)

TS_NO <- ts(t_NO, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_NO)

TS_NO2 <- ts(t_NO2, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_NO2)

TS_NOX <- ts(t_NOX, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_NOX)

TS_O3 <- ts(t_O3, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_O3)

TS_PM25 <- ts(t_PM25, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_PM25)

#Imputation Procedure
#First Step Plot all NAs
par(mfrow=c(3,3))
plotNA.distribution(TS_CO, xlab = "Year", ylab = "CO")
plotNA.distribution(TS_NO, xlab = "Year", ylab = "NO")
plotNA.distribution(TS_NO2, xlab = "Year", ylab = "NO2")
plotNA.distribution(TS_NOX, xlab = "Year", ylab = "NOX")
plotNA.distribution(TS_O3, xlab = "Year", ylab = "O3")
plotNA.distribution(TS_PM25, xlab = "Year", ylab = "PM25")
#distribution of NAs in time series
par(mfrow=c(1,1))

plotNA.distributionBar(TS_CO, ylab = "CO")

plotNA.distributionBar(TS_NO, ylab = "NO")

plotNA.distributionBar(TS_NO2, ylab = "NO2")

plotNA.distributionBar(TS_NOX, ylab = "NOX")

plotNA.distributionBar(TS_O3, ylab = "O3")

plotNA.distributionBar(TS_PM25, ylab = "PM25")

#Gap size plot
par(mfrow=c(1,1))
plotNA.gapsize(t_CO)

plotNA.gapsize(t_NO)

plotNA.gapsize(t_NO2)

plotNA.gapsize(t_NOX)
plotNA.gapsize(t_O3)

plotNA.gapsize(t_PM25)

#NA Interpolation Impute Linear, Spline, vs Stinterp
intp_CO <- na_interpolation(t_CO, option = "linear")
intp_NO <- na_interpolation(t_NO, option = "linear")
intp_NO2 <- na_interpolation(t_NO2, option = "linear")
intp_NOX <- na_interpolation(t_NOX, option = "linear")
intp_O3 <- na_interpolation(t_O3, option = "linear")
intp_PM25 <- na_interpolation(t_PM25, option = "linear")
#Plotting imputed datasets - hourly time series to check imputation
TS_IMCO <- ts(intp_CO, start = c(2006, 1), end = c(2010, 12), frequency = 24*365)
plot(TS_IMCO)

#Since the data is multiseasonal this is how to convert it into time series
mts_CO <- msts(intp_CO, seasonal.periods = 8766, start=2006)
mts_NO <- msts(intp_NO, seasonal.periods = 8766, start=2006)
mts_NO2 <- msts(intp_NO2, seasonal.periods = 8766, start=2006)
mts_NOX <- msts(intp_NOX, seasonal.periods = 8766, start=2006)
mts_O3 <- msts(intp_O3, seasonal.periods = 8766, start=2006)
mts_PM25 <- msts(intp_PM25, seasonal.periods = 8766, start=2006)
#Decomposition of Seasonality, Trends and Random factor from TS
plot(decompose(mts_CO))

plot(decompose(mts_NO))

plot(decompose(mts_NO2))

plot(decompose(mts_NOX))

plot(decompose(mts_O3))

plot(decompose(mts_PM25))
